{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e384109d",
   "metadata": {},
   "source": [
    "# 数据泄露\n",
    "#### 目标泄露\n",
    "#### 训练测试污染"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "7c8ab242",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of rows in the dataset: 1319\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>reports</th>\n",
       "      <th>age</th>\n",
       "      <th>income</th>\n",
       "      <th>share</th>\n",
       "      <th>expenditure</th>\n",
       "      <th>owner</th>\n",
       "      <th>selfemp</th>\n",
       "      <th>dependents</th>\n",
       "      <th>months</th>\n",
       "      <th>majorcards</th>\n",
       "      <th>active</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>37.66667</td>\n",
       "      <td>4.5200</td>\n",
       "      <td>0.033270</td>\n",
       "      <td>124.983300</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>3</td>\n",
       "      <td>54</td>\n",
       "      <td>1</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>33.25000</td>\n",
       "      <td>2.4200</td>\n",
       "      <td>0.005217</td>\n",
       "      <td>9.854167</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>3</td>\n",
       "      <td>34</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>33.66667</td>\n",
       "      <td>4.5000</td>\n",
       "      <td>0.004156</td>\n",
       "      <td>15.000000</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>4</td>\n",
       "      <td>58</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>30.50000</td>\n",
       "      <td>2.5400</td>\n",
       "      <td>0.065214</td>\n",
       "      <td>137.869200</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>0</td>\n",
       "      <td>25</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>32.16667</td>\n",
       "      <td>9.7867</td>\n",
       "      <td>0.067051</td>\n",
       "      <td>546.503300</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>2</td>\n",
       "      <td>64</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   reports       age  income     share  expenditure  owner  selfemp  \\\n",
       "0        0  37.66667  4.5200  0.033270   124.983300   True    False   \n",
       "1        0  33.25000  2.4200  0.005217     9.854167  False    False   \n",
       "2        0  33.66667  4.5000  0.004156    15.000000   True    False   \n",
       "3        0  30.50000  2.5400  0.065214   137.869200  False    False   \n",
       "4        0  32.16667  9.7867  0.067051   546.503300   True    False   \n",
       "\n",
       "   dependents  months  majorcards  active  \n",
       "0           3      54           1      12  \n",
       "1           3      34           1      13  \n",
       "2           4      58           1       5  \n",
       "3           0      25           1       7  \n",
       "4           2      64           1       5  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# 读取数据\n",
    "# true_values : list, default None\n",
    "# Values to consider as True\n",
    "# false_values : list, default None\n",
    "# Values to consider as False\n",
    "data = pd.read_csv('../../datasets/AER_credit_card_data.csv', \n",
    "                   true_values = ['yes'], false_values = ['no'])\n",
    "\n",
    "# 选择目标\n",
    "y = data.card\n",
    "\n",
    "# 选择预测\n",
    "X = data.drop(['card'], axis=1)\n",
    "\n",
    "print(\"Number of rows in the dataset:\", X.shape[0])\n",
    "# 用head()瞄一眼X的数据\n",
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "3a316d6e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cross-validation accuracy: 0.980294\n"
     ]
    }
   ],
   "source": [
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "# 因为没有预处理，所以我们不需要pipeline\n",
    "my_pipeline = make_pipeline(RandomForestClassifier(n_estimators=100))\n",
    "cv_scores = cross_val_score(my_pipeline, X, y, \n",
    "                            cv=5,\n",
    "                            scoring='accuracy')\n",
    "\n",
    "print(\"Cross-validation accuracy: %f\" % cv_scores.mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a3651ab5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fraction of those who did not receive a card and had no expenditures: 1.00\n",
      "Fraction of those who received a card and had no expenditures: 0.02\n"
     ]
    }
   ],
   "source": [
    "# 支出是指此卡上的支出还是申请之前使用的卡片上的支出\n",
    "expenditures_cardholders = X.expenditure[y]\n",
    "expenditures_noncardholders = X.expenditure[~y]\n",
    "\n",
    "print('Fraction of those who did not receive a card and had no expenditures: %.2f' \\\n",
    "      %((expenditures_noncardholders == 0).mean()))\n",
    "print('Fraction of those who received a card and had no expenditures: %.2f' \\\n",
    "      %(( expenditures_cardholders == 0).mean()))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fc0aa754",
   "metadata": {},
   "source": [
    "##### 从上面数据中可得出：\n",
    "没有申请卡且无消费的的占比为100%，有卡但无消费的占比仅为2%。大概率为目标泄露\n",
    "#### 下面尝试运行一个无数据泄露的模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "95c3ce4e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cross-val accuracy: 0.830156\n"
     ]
    }
   ],
   "source": [
    "# 从数据集中删除潜在泄漏数据\n",
    "potential_leaks = ['expenditure', 'share', 'active', 'majorcards']\n",
    "X2 = X.drop(potential_leaks, axis=1)\n",
    "\n",
    "# Evaluate the model with leaky predictors removed\n",
    "cv_scores = cross_val_score(my_pipeline, X2, y, \n",
    "                            cv=5,\n",
    "                            scoring='accuracy')\n",
    "\n",
    "print(\"Cross-val accuracy: %f\" % cv_scores.mean())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
